from pyprojroot import hereDeveloping the employment heatmap visualization
Setup
Parameters
# Location of the labour CSV, built relative to the path returned by here().
LABOUR_DATA_FILE = here() / "data" / "14100355.csv"
# Passed to theme(figure_size=...) by the plots in this document.
FIGURE_THEME_SIZE = (8, 6)
# (first, last) YEAR kept by the plot filters; both ends are inclusive (>= / <=).
FILTER_YEAR = (2018, 2025)Libraries
import polars as pl
import polars.selectors as cs
from mizani.bounds import squish
import mizani.labels as ml
import mizani.breaks as mb
import textwrap
from pyprojroot import here
from great_tables import GT, md, html
from plotnine import *
from labourcan.data_processing import read_labourcan,calculate_centered_rankRead data
read_labourcan returns a polars DataFrame with:
- Unused columns removed
- Filtered to seasonally adjusted estimates only
- Filtered to Canada level estimates
- Additional YEAR, MONTH, and DATE_YMD columns extracted from REF_DATE
- Sorted chronologically by year and month
See labour.qmd for details on data processing.
# Load the labour CSV through the project helper (the bullet list above describes what it returns).
labour = read_labourcan(LABOUR_DATA_FILE)
# Presumably adds the centered_rank_across_industry column the plots use — confirm in labourcan.data_processing.
labour_processed = calculate_centered_rank(labour)Heatmap of Employment Numbers
Let’s take a stab at a first visual.
# First-pass heatmap: one square marker per row, coloured by PDIFF.
(
ggplot(
(
# Keep only rows whose YEAR falls inside FILTER_YEAR (inclusive on both ends).
labour_processed.filter(
pl.col("YEAR") >= FILTER_YEAR[0],
pl.col("YEAR") <= FILTER_YEAR[1]
)
), aes(x="DATE_YMD", y="centered_rank_across_industry", color="PDIFF"))
# shape="s" draws square markers.
+ geom_point(shape="s")
+ theme_tufte()
+ theme(figure_size=FIGURE_THEME_SIZE, axis_text_x=element_text(angle=90))
# Diverging colour scale centred on 0; oob=squish maps values outside the
# limits onto the nearest limit instead of dropping them.
+ scale_color_gradient2(
limits=(-0.01, 0.01), low="#ff0000ff", high="#0000dbff", midpoint=0, oob=squish)
)geom_point or geom_tile
It looks good, but the whitespace between each point is distracting. I could make the points larger, but how much whitespace remains between them ultimately depends on the ratio of point size to the range of the x and y axes, as well as on the figure size.
We can use geom_tile instead, which will plot rectangles specified by a center point.
# Rows without a rank cannot be placed on the y axis, so drop them first.
labour_processed_cat = labour_processed.drop_nulls(
    ["centered_rank_across_industry"]
)
# Category labels: the distinct ranks in ascending numeric order, as strings.
# Sorting happens before the string cast so the order stays numeric.
order = (
    labour_processed_cat.get_column("centered_rank_across_industry")
    .unique()
    .sort()
    .cast(pl.Utf8)
    .to_list()
)
# Add centered_rank_cat: the rank as an Enum whose category order is `order`.
labour_processed_cat = labour_processed_cat.with_columns(
    centered_rank_cat=pl.col("centered_rank_across_industry")
    .cast(pl.Utf8)
    .cast(pl.Enum(categories=order))
)
# Same heatmap drawn with rectangles (geom_tile) instead of point markers.
(
ggplot((
# Keep only rows whose YEAR falls inside FILTER_YEAR (inclusive on both ends).
labour_processed_cat.filter(
pl.col("YEAR") >= FILTER_YEAR[0],
pl.col("YEAR") <= FILTER_YEAR[1]
)
# NOTE(review): y is the numeric centered_rank_across_industry, not the
# centered_rank_cat Enum column built just above — confirm this is intended.
), aes(x="DATE_YMD", y="centered_rank_across_industry", fill="PDIFF"))
+ geom_tile(height=0.95) # whitespace between tiles, vertically
+ theme_tufte()
+ theme(
figure_size=FIGURE_THEME_SIZE,
axis_text_x=element_text(angle=90)
)
# Diverging fill scale centred on 0; oob=squish maps values outside the
# limits onto the nearest limit instead of dropping them.
+ scale_fill_gradient2(
limits=(-0.01, 0.01), low="#ff0000ff", high="#0000dbff", midpoint=0, oob=squish)
)This is looking pretty good. I added height = 0.95 to add some whitespace between tiles vertically. I actually wanted to remove whitespace completely, but I discovered width for geom_tile doesn’t work the same as it does for ggplot2. If I set width=1 it seems to make the tiles smaller, instead of wider.
Explicit color mapping with scale_fill_manual
I am fairly happy with the scale_fill_gradient2 used with squish. We get a really nice palette that’s centered around 0. However, scale_fill_gradient2 is limited to 3 colors (high, midpoint, low), which does not quite enable the more dynamic color palette that I’m seeking.
To be more explicit with the colors, I will bin the PDIFF and map colors manually using scale_fill_manual
Bin with polars.Series.cut
# Bin PDIFF into labelled intervals so each bin can be given an explicit colour.
# The cut points are symmetric around 0; cut() also yields the two open-ended
# outer bins, "(-inf, -0.05]" and "(0.05, inf]".
labour_processed_cutted = (
    labour_processed_cat.with_columns(
        pl.col("PDIFF")
        .cut(
            [
                -0.05,
                -0.025,
                -0.012,
                -0.0080,
                -0.0040,
                0,
                0.0040,
                0.0080,
                0.012,
                0.025,
                0.05,
            ]
        )
        .alias("PDIFF_BINNED")
    )
    .with_columns(
        # Intervals are right-closed, so an exact 0 would land in "(-0.004, 0]".
        # Give it a dedicated "0" label so "no change" can get its own colour.
        pl.when(pl.col("PDIFF") == 0)
        .then(pl.lit("0"))
        .otherwise(pl.col("PDIFF_BINNED"))
        .alias("PDIFF_BINNED")
    )
    # Sorted by PDIFF so bin labels are first encountered in ascending order.
    .sort("PDIFF")
    # The former trailing `.with_columns(pl.col("PDIFF_BINNED"))` re-selected
    # the column unchanged (a no-op) and has been removed.
)
labour_processed_cutted.group_by("PDIFF_BINNED").len()| PDIFF_BINNED | len |
|---|---|
| cat | u32 |
| "(0.008, 0.012]" | 1021 |
| "(-0.025, -0.012]" | 892 |
| "(0.012, 0.025]" | 1292 |
| "(-0.004, 0]" | 1999 |
| "(0.025, 0.05]" | 315 |
| … | … |
| "(-0.012, -0.008]" | 717 |
| "(0, 0.004]" | 2624 |
| "(0.05, inf]" | 58 |
| "(-0.008, -0.004]" | 1201 |
| "(-inf, -0.05]" | 47 |
# Heatmap of the binned values. No fill scale is set here, so plotnine's
# default discrete colours are used.
(
ggplot(
(
# Keep only rows whose YEAR falls inside FILTER_YEAR (inclusive on both ends).
labour_processed_cutted.filter(
pl.col("YEAR") >= FILTER_YEAR[0], pl.col("YEAR") <= FILTER_YEAR[1]
)
),
# y uses the Enum column centered_rank_cat; fill uses the binned labels.
aes(x="DATE_YMD", y="centered_rank_cat", fill="PDIFF_BINNED"),
)
+ geom_tile(height=0.95) # whitespace between tiles, vertically
+ theme_tufte()
+ theme(figure_size=FIGURE_THEME_SIZE, axis_text_x=element_text(angle=90))
)scale_fill_manual for explicit color mapping
Now we need to order the levels, and map explicit colors
We will make PDIFF = 0% gray, give positive values green and blue colors (job growth = good), and give negative values warmer (alarming, bad) colors.
# Bin labels ordered from the most negative to the most positive PDIFF.
# Only rows with a null PDIFF_BINNED are dropped: a bare drop_nulls() would
# also discard rows that are null in any unrelated column, which could remove
# a bin label from `order` and make the Enum cast below fail.
order = (
    labour_processed_cutted.drop_nulls(["PDIFF_BINNED"])
    .sort("PDIFF")
    .select(pl.col("PDIFF_BINNED"))
    .unique(maintain_order=True)  # keep first-seen (ascending PDIFF) order
    .to_series()
    .to_list()
)
# Same frame with PDIFF_BINNED typed as an Enum whose category order is `order`.
# NOTE(review): the plots in this document read labour_processed_cutted, not
# this frame — confirm whether they were meant to use it.
labour_processed_cutted_ordered = labour_processed_cutted.with_columns(
    pl.col("PDIFF_BINNED").cast(pl.Enum(order))
)
# Fill colour for each PDIFF_BINNED label, listed from the largest loss to the
# largest gain. Passed as `values=` to scale_fill_manual.
color_mapping = {
# Negative change: warm colours, strongest red for the largest losses.
"(-inf, -0.05]": "#d82828ff",
"(-0.05, -0.025]": "#fa6f1fff",
"(-0.025, -0.012]": "#f1874aff",
"(-0.012, -0.008]": "#f1b274ff",
"(-0.008, -0.004]": "#FEE08B",
"(-0.004, 0]": "#FFFFBF",
# Exactly no change: neutral gray.
"0": "#a8a8a8ff",
# Positive change: greens, with blue reserved for the largest gains.
"(0, 0.004]": "#E6F5D0",
"(0.004, 0.008]": "#bce091ff",
"(0.008, 0.012]": "#9ad65fff",
"(0.012, 0.025]": "#78b552ff",
"(0.025, 0.05]": "#5cb027ff",
"(0.05, inf]": "#1f6fc6ff",
}
# Heatmap with an explicit colour per bin via scale_fill_manual.
(
ggplot(
(
# Keep only rows whose YEAR falls inside FILTER_YEAR (inclusive on both ends).
# NOTE(review): this reads labour_processed_cutted; the Enum-typed
# labour_processed_cutted_ordered built above is not used — confirm.
labour_processed_cutted.filter(
pl.col("YEAR") >= FILTER_YEAR[0], pl.col("YEAR") <= FILTER_YEAR[1]
)
),
aes(x="DATE_YMD", y="centered_rank_across_industry", fill="PDIFF_BINNED"),
)
# color="white" outlines each tile in white.
+ geom_tile(color="white")
# + geom_point(shape="s")
+ theme_tufte()
+ theme(figure_size=FIGURE_THEME_SIZE, axis_text_x=element_text(angle=90))
# values maps bin label -> colour; breaks lists the bins in ascending order.
+ scale_fill_manual(values=color_mapping, breaks=order)
)That looks great. The power of scale_fill_manual enables much more control over the color palette. However, the cost was that it takes a lot more effort and lines of code to create a custom mapping.
The legend
…is extremely accurate; however, we are going to simplify it and make it nicer to look at.
First let’s make the text more concise: we don’t need every bin to be labelled, and instead of listing the range, we can just describe the midpoint.
# Legend text, one entry per bin in ascending-PDIFF order (the same order as
# the keys of color_mapping). Most bins are left blank; a labelled bin shows
# the value at its midpoint, and the two open-ended bins show their boundary.
# "1%" sits on (0.008, 0.012] so that it mirrors "-1%" on (-0.012, -0.008];
# it was previously one slot too high, on the (0.012, 0.025] bin.
legend_labels = [
    "-5%",  # (-inf, -0.05]: the ends can be labelled with the boundary e.g. implies <-5%
    "",  # (-0.05, -0.025]
    "",  # (-0.025, -0.012]
    "-1%",  # (-0.012, -0.008]
    "",  # (-0.008, -0.004]
    "",  # (-0.004, 0]
    "No change",  # 0
    "",  # (0, 0.004]
    "",  # (0.004, 0.008]
    "1%",  # (0.008, 0.012]
    "",  # (0.012, 0.025]
    "",  # (0.025, 0.05]
    "5%",  # (0.05, inf]
]
# Heatmap with a compact vertical legend: short labels and tightly packed keys.
(
ggplot(
# Keep only rows whose YEAR falls inside FILTER_YEAR (inclusive on both ends).
labour_processed_cutted.filter(
pl.col("YEAR") >= FILTER_YEAR[0], pl.col("YEAR") <= FILTER_YEAR[1]
),
aes(x="DATE_YMD", y="centered_rank_across_industry", fill="PDIFF_BINNED"),
)
+ geom_tile(color="white")
+ theme_tufte()
+ theme(
figure_size=FIGURE_THEME_SIZE,
axis_text_x=element_text(angle=90),
legend_justification_right=1,
legend_position="right",
legend_text_position="right",
# No legend title.
legend_title=element_blank(),
# Zero spacing so the keys form one continuous colour strip.
legend_key_spacing=0,
legend_key_width=10,
legend_key_height=10,
legend_text=element_text(size=8),
)
# labels replaces each bin's interval text with the short legend_labels entry.
+ scale_fill_manual(values=color_mapping, breaks=order, labels=legend_labels)
)Looks much better than my first attempt with a horizontal legend
Text and fonts
Next up is the text and fonts. I played with a few fonts on google fonts before settling on two.
First, install the fonts:
# Font families for the final plot: FONT_PRIMARY is the general text family,
# FONT_SECONDARY is used for axis tick text and legend text.
FONT_PRIMARY = "Playfair Display"
FONT_SECONDARY = "Lato"
import mpl_fontkit as fk
# Presumably fetches each family from Google Fonts and registers it with
# matplotlib — confirm against the mpl_fontkit documentation.
fk.install(FONT_PRIMARY)
fk.install(FONT_SECONDARY)Font name: `Playfair Display`
Font name: `Lato`
plotnine breaks and labels for the scales can be easily adjusted using mizani, which is plotnine’s equivalent of the scales package for ggplot2.
We’re going to use mizani.breaks.breaks_date_width to put breaks for each year, and mizani.labels.label_date to drop the “month” part of the date.
import mizani.labels as ml
import mizani.breaks as mb
# Final styled heatmap: binned fill colours, custom fonts, a compact vertical
# legend, yearly x-axis breaks, and a title/subtitle. Layers are added in the
# same order as before because later theme() settings override theme_tufte().
plot = (
    ggplot(
        # Rows whose YEAR lies within FILTER_YEAR, both ends inclusive.
        labour_processed_cutted.filter(
            pl.col("YEAR").is_between(FILTER_YEAR[0], FILTER_YEAR[1])
        ),
        aes(x="DATE_YMD", y="centered_rank_across_industry", fill="PDIFF_BINNED"),
    )
    + geom_tile(color="white", height=0.95)
    + theme_tufte()
    + theme(
        figure_size=FIGURE_THEME_SIZE,
        # Text: primary family by default, secondary family for tick labels.
        text=element_text(family=FONT_PRIMARY),
        axis_text_x=element_text(family=FONT_SECONDARY),
        axis_text_y=element_text(family=FONT_SECONDARY),
        axis_title_y=element_text(weight=300),
        plot_title=element_text(ha="left"),
        plot_subtitle=element_text(ha="left", margin={"b": 1, "units": "lines"}),
        # Legend: right-hand side, no title, keys packed with zero spacing.
        legend_position="right",
        legend_justification_right=1,
        legend_title=element_blank(),
        legend_title_position="top",
        legend_text=element_text(size=8, family=FONT_SECONDARY),
        legend_text_position="right",
        legend_key_spacing=0,
        legend_key_width=15,
        legend_key_height=15,
    )
    + scale_fill_manual(values=color_mapping, breaks=order, labels=legend_labels)
    # Single-column legend, reversed so the largest gains appear at the top.
    + guides(fill=guide_legend(ncol=1, reverse=True))
    + scale_x_datetime(
        breaks=mb.breaks_date_width("1 years"),
        labels=ml.label_date("%Y"),  # show only the year
        expand=(0, 0),
    )
    + labs(
        title="Sector Shifts: Where Canada's Jobs Are Moving",
        subtitle=textwrap.fill(
            "Track the number of industries gaining or losing jobs each month. "
            "Boxes are shaded based on percentage change from previous month "
            "in each industry's employment levels.",
            width=75,
        ),
        x="",
        y="< SECTORS FALLING SECTORS RISING >",
    )
)
plotHighlighting an Industry
For deeper insights, I would like to see where each individual industry ranks in the graphic.
# List the distinct Industry values that can be highlighted.
labour_processed_cutted.get_column("Industry").unique().to_list()
# Industry whose monthly position is overlaid on the heatmap.
INDUSTRY = "Wholesale and retail trade [41, 44-45]"
# Rows for that industry only, limited to FILTER_YEAR (both ends inclusive).
plot_data_subsetted = labour_processed_cutted.filter(
    pl.col("YEAR").is_between(FILTER_YEAR[0], FILTER_YEAR[1])
    & (pl.col("Industry") == INDUSTRY)
)
# Overlay a black point on the tile of the selected industry for each month.
(
plot
# Uses plot_data_subsetted as the layer's data; no aes() is given here, so the
# x/y mapping presumably comes from `plot` — confirm against plotnine docs.
+ geom_point(data=plot_data_subsetted, color='black', fill='black')
)Line plot of unemployment
Appendix
Things that didn’t work
This section is a non-exhaustive list of design elements I wasn’t able to solve with plotnine
Horizontal legend with horizontal legend text
Initially I wanted a horizontal legend for the colors. But in order to remove the whitespace between keys, I discovered that the text needs to be smaller than the legend keys; otherwise it “pushes” the legend keys apart in an uneven manner. I attempted (unsuccessfully) to address this by making the legend text small, eliminating as much text as possible (e.g. removing the “%” characters for -0.50 and 0.50), and lastly increasing the legend key size.
But it still didn’t really work out the way I hoped, so I stuck with a vertical legend instead.